Analysis by LEA¶

In this notebook, about 7,800 districts are analysed. The data includes the enrollment of males and females in various programs and high school level courses. Districts with low enrollment are removed from the analysis. The analysis consists of principal component analysis (PCA), K-Means clustering, linear discriminant analysis (LDA), correlation, covariance, and multiple regression predicting poverty rate based on enrollment.

In [1]:
# SQL aggregation: roll school-level CRDC records up to the LEA (district) level.
# GREATEST(col, 0) floors each count at zero before summing; negative entries in
# these columns are presumably reserve/missing-data codes -- TODO confirm against
# the CRDC codebook.
# The result is restricted to high-school-only campuses (chr.hs_only = TRUE) and
# joined to SAIPE district poverty estimates on leaid (avg() since SAIPE is
# district-level and repeats per school row).
query="""
SELECT 
	rls.leaid
	,min(rls.lea_name) AS lea_name
	,min(rls.lea_state) as lea_state
	,sum(GREATEST(advmath.tot_mathenr_advm_m,0)) AS advmath_m_enr
	,sum(GREATEST(advmath.tot_mathenr_advm_f,0)) AS advmath_f_enr
	,sum(GREATEST(advpl.TOT_APEXAM_NONE_M,0)) AS advpl_m_noexam
	,sum(GREATEST(advpl.TOT_APEXAM_NONE_F,0)) AS advpl_f_noexam
	,sum(GREATEST(alg1.TOT_ALGPASS_GS0910_M,0)) AS alg1_m_0910_passed
	,sum(GREATEST(alg1.TOT_ALGPASS_GS1112_M,0)) AS alg1_m_1112_passed
	,sum(GREATEST(alg1.TOT_ALGPASS_GS0910_F,0)) AS alg1_f_0910_passed
	,sum(GREATEST(alg1.TOT_ALGPASS_GS1112_F,0)) AS alg1_f_1112_passed
	,sum(GREATEST(alg2.tot_mathenr_alg2_m,0)) AS alg2_m_enr
	,sum(GREATEST(alg2.tot_mathenr_alg2_f,0)) AS alg2_f_enr
	,sum(GREATEST(bio.TOT_SCIENR_BIOL_M,0)) AS bio_m_enr
	,sum(GREATEST(bio.TOT_SCIENR_BIOL_F,0)) AS bio_f_enr
	,sum(GREATEST(calc.TOT_MATHENR_CALC_M,0)) AS calc_m_enr
	,sum(GREATEST(calc.TOT_MATHENR_CALC_F,0)) AS calc_f_enr
	,sum(GREATEST(chem.TOT_SCIENR_CHEM_M,0)) AS chem_m_enr
	,sum(GREATEST(chem.TOT_SCIENR_CHEM_F,0)) AS chem_f_enr
	,sum(GREATEST(dual.TOT_DUAL_M,0)) AS dual_m_enr
	,sum(GREATEST(dual.TOT_DUAL_F,0)) AS dual_f_enr
	,sum(GREATEST(enr.tot_enr_m,0)) AS total_m_enr
	,sum(GREATEST(enr.tot_enr_f,0)) AS total_f_enr
	,sum(GREATEST(enr.SCH_ENR_LEP_M,0)) AS enr_lep_m
	,sum(GREATEST(enr.SCH_ENR_LEP_F,0)) AS enr_lep_f
	,sum(GREATEST(enr.SCH_ENR_504_M,0)) AS enr_504_m
	,sum(GREATEST(enr.SCH_ENR_504_F,0)) AS enr_504_f
	,sum(GREATEST(enr.SCH_ENR_IDEA_M,0)) AS enr_idea_m
	,sum(GREATEST(enr.SCH_ENR_IDEA_F,0)) AS enr_idea_f
	,sum(GREATEST(geo.TOT_MATHENR_GEOM_M,0)) AS geo_m_enr
	,sum(GREATEST(geo.TOT_MATHENR_GEOM_F,0)) AS geo_f_enr
	,sum(GREATEST(phys.TOT_SCIENR_PHYS_M,0)) AS phys_m_enr
	,sum(GREATEST(phys.TOT_SCIENR_PHYS_F,0)) AS phys_f_enr
	,sum(GREATEST(satact.TOT_SATACT_M,0)) AS satact_m
	,sum(GREATEST(satact.TOT_SATACT_F,0)) AS satact_f
	,avg(saipe.totalpopulation) AS totalpopulation 
	,avg(saipe.population5_17) AS population5_17
	,avg(saipe.population5_17inpoverty) AS population5_17inpoverty
FROM ref_schema.ref_lea_sch rls
JOIN data_schema.sch_advancedmathematics advmath ON advmath.combokey = rls.combokey
JOIN data_schema.sch_advancedplacement advpl ON advpl.combokey = rls.combokey
JOIN data_schema.sch_algebrai alg1 ON alg1.combokey = rls.combokey
JOIN data_schema.sch_algebraii alg2 ON alg2.combokey = rls.combokey 
JOIN data_schema.sch_biology bio ON bio.combokey = rls.combokey 
JOIN data_schema.sch_calculus calc ON calc.combokey = rls.combokey 
JOIN data_schema.sch_chemistry chem ON chem.combokey = rls.combokey 
JOIN data_schema.sch_dualenrollment dual ON dual.combokey = rls.combokey 
JOIN data_schema.sch_enrollment enr ON enr.combokey = rls.combokey 
JOIN data_schema.sch_geometry geo ON geo.combokey = rls.combokey 
JOIN data_schema.sch_physics phys ON phys.combokey = rls.combokey 
JOIN data_schema.sch_satandact satact ON satact.combokey = rls.combokey 
JOIN data_schema.sch_schoolcharacteristics chr ON chr.combokey = rls.combokey 
JOIN data_schema.saipe_ussd17 saipe ON saipe.leaid = rls.leaid
WHERE chr.hs_only = TRUE
group by rls.leaid
order by leaid;
"""
In [2]:
import os

from sqlalchemy import create_engine

# Connection parameters come from the environment when available; the literal
# fallbacks keep the notebook runnable against the original docker-compose
# setup. NOTE(review): never commit real credentials -- prefer a .env file or
# a secrets manager over in-notebook literals.
db_params = {
    "database": os.environ.get("PGDATABASE", "postgres"),
    "user": os.environ.get("PGUSER", "postgres"),
    "password": os.environ.get("PGPASSWORD", "pwd123"),
    "host": os.environ.get("PGHOST", "postgres-db"),
    "port": os.environ.get("PGPORT", "5432"),
}
connection_string = (
    f"postgresql://{db_params['user']}:{db_params['password']}"
    f"@{db_params['host']}:{db_params['port']}/{db_params['database']}"
)
engine = create_engine(connection_string)
In [3]:
import numpy as np
import pandas as pd
import plotly.graph_objs as go
import plotly.io as pio
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from kneed import KneeLocator
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
In [4]:
# df = pd.read_csv('LEA_agg_data.csv')
In [5]:
# Pull the aggregated LEA-level dataset straight from Postgres.
df = pd.read_sql(query, engine)
In [6]:
# Inspect the returned column names.
df.columns
Out[6]:
Index(['leaid', 'lea_name', 'lea_state', 'advmath_m_enr', 'advmath_f_enr',
       'advpl_m_noexam', 'advpl_f_noexam', 'alg1_m_0910_passed',
       'alg1_m_1112_passed', 'alg1_f_0910_passed', 'alg1_f_1112_passed',
       'alg2_m_enr', 'alg2_f_enr', 'bio_m_enr', 'bio_f_enr', 'calc_m_enr',
       'calc_f_enr', 'chem_m_enr', 'chem_f_enr', 'dual_m_enr', 'dual_f_enr',
       'total_m_enr', 'total_f_enr', 'enr_lep_m', 'enr_lep_f', 'enr_504_m',
       'enr_504_f', 'enr_idea_m', 'enr_idea_f', 'geo_m_enr', 'geo_f_enr',
       'phys_m_enr', 'phys_f_enr', 'satact_m', 'satact_f', 'totalpopulation',
       'population5_17', 'population5_17inpoverty'],
      dtype='object')
In [7]:
# Summary statistics for every numeric column.
df.describe()
Out[7]:
advmath_m_enr advmath_f_enr advpl_m_noexam advpl_f_noexam alg1_m_0910_passed alg1_m_1112_passed alg1_f_0910_passed alg1_f_1112_passed alg2_m_enr alg2_f_enr ... enr_idea_f geo_m_enr geo_f_enr phys_m_enr phys_f_enr satact_m satact_f totalpopulation population5_17 population5_17inpoverty
count 7826.000000 7826.000000 7826.000000 7826.000000 7826.000000 7826.000000 7826.000000 7826.000000 7826.000000 7826.000000 ... 7826.000000 7826.000000 7826.000000 7826.000000 7826.000000 7826.000000 7826.000000 7.826000e+03 7.826000e+03 7826.000000
mean 125.057884 134.328009 39.305776 46.637618 121.851776 7.589318 118.630718 5.891899 159.460900 162.658191 ... 67.333248 186.335165 180.875160 98.532456 82.388832 186.506645 209.065551 3.912937e+04 6.141668e+03 1072.236392
std 469.030397 495.005708 154.167803 174.021223 457.476194 32.207637 436.891689 26.552504 622.387927 627.130531 ... 278.555564 736.621938 695.335914 368.763361 329.220959 743.616365 852.540876 1.549392e+05 2.333012e+04 6041.762535
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.900000e+01 3.000000e+00 0.000000
25% 9.000000 11.000000 0.000000 0.000000 19.000000 0.000000 17.000000 0.000000 23.000000 24.000000 ... 11.000000 29.000000 27.000000 5.000000 4.000000 22.000000 25.000000 6.226750e+03 1.004250e+03 126.000000
50% 34.000000 37.000000 2.000000 3.000000 45.000000 1.000000 44.000000 1.000000 57.000000 58.000000 ... 26.000000 66.000000 64.000000 24.000000 18.000000 60.000000 67.500000 1.379900e+04 2.168500e+03 282.000000
75% 98.000000 106.000000 25.000000 30.000000 103.000000 5.000000 100.000000 3.000000 132.000000 135.000000 ... 59.000000 148.000000 146.000000 79.000000 66.000000 150.000000 169.000000 3.066575e+04 4.799000e+03 719.750000
max 16168.000000 18784.000000 5640.000000 6744.000000 24083.000000 979.000000 22173.000000 852.000000 30024.000000 29440.000000 ... 15899.000000 36450.000000 33054.000000 12950.000000 11428.000000 38090.000000 42968.000000 8.622698e+06 1.237149e+06 304745.000000

8 rows × 35 columns

In [8]:
# Identifier and population columns are left untouched by the transforms below.
# 'total_enrollment' does not exist yet; it is pre-listed here so later cells
# can reuse exclude_cols unchanged once that column is created.
exclude_cols = [
    'leaid', 'lea_name', 'lea_state',
    'totalpopulation', 'population5_17',
    'population5_17inpoverty', 'total_enrollment',
]
# Floor every count column at zero (negative values are presumably
# reserve/missing-data codes -- TODO confirm; the SQL already applies GREATEST).
columns_to_modify = df.columns.difference(exclude_cols)
df[columns_to_modify] = df[columns_to_modify].clip(lower=0)
In [9]:
# Convert raw counts to per-student rates by dividing by total enrollment.
enrollment_sum = df['total_m_enr'] + df['total_f_enr']
df['total_enrollment'] = enrollment_sum
columns_to_modify = df.columns.difference(exclude_cols)
# Replace a zero denominator with NaN first: x / 0 yields inf (which .fillna
# would NOT catch), whereas x / NaN yields NaN and maps cleanly to 0. The
# zero-enrollment rows are dropped a few cells below anyway, but this keeps
# the intermediate frame free of infinities.
df[columns_to_modify] = (
    df[columns_to_modify]
    .div(enrollment_sum.replace(0, np.nan), axis=0)
    .fillna(0)
)
In [10]:
# Show the districts about to be dropped for having 10 or fewer students.
df[enrollment_sum <= 10][['total_enrollment','leaid',
                         'lea_state','totalpopulation']]
Out[10]:
total_enrollment leaid lea_state totalpopulation
765 6 0804500 CO 1853.0
1491 6 1726880 IL 159563.0
2625 0 2509360 MA 54172.0
3164 8 2730870 MN 4559.0
3699 8 3027930 MT 195.0
3701 5 3028170 MT 359.0
4710 7 3812930 ND 1230.0
5889 2 4217130 PA 4671.0
6840 7 4831470 TX 2079.0
7058 5 4844710 TX 1986.0
7331 7 5305040 WA 515.0
In [11]:
# Keep only districts with more than 10 enrolled students and renumber rows
# so positional (iloc) and label (loc) indexing agree downstream.
df = df.loc[enrollment_sum > 10].reset_index(drop=True)
In [12]:
# Share of the 5-17 population living in poverty (SAIPE counts; population5_17
# is strictly positive in this data per the describe() output above).
df['5_17_poverty_percent'] = df['population5_17inpoverty']/df['population5_17']
In [13]:
# Feature columns that will feed PCA / clustering / regression.
df.columns.difference(exclude_cols)
Out[13]:
Index(['5_17_poverty_percent', 'advmath_f_enr', 'advmath_m_enr',
       'advpl_f_noexam', 'advpl_m_noexam', 'alg1_f_0910_passed',
       'alg1_f_1112_passed', 'alg1_m_0910_passed', 'alg1_m_1112_passed',
       'alg2_f_enr', 'alg2_m_enr', 'bio_f_enr', 'bio_m_enr', 'calc_f_enr',
       'calc_m_enr', 'chem_f_enr', 'chem_m_enr', 'dual_f_enr', 'dual_m_enr',
       'enr_504_f', 'enr_504_m', 'enr_idea_f', 'enr_idea_m', 'enr_lep_f',
       'enr_lep_m', 'geo_f_enr', 'geo_m_enr', 'phys_f_enr', 'phys_m_enr',
       'satact_f', 'satact_m', 'total_f_enr', 'total_m_enr'],
      dtype='object')
In [14]:
# Spot-check the normalized frame.
df.head()
Out[14]:
leaid lea_name lea_state advmath_m_enr advmath_f_enr advpl_m_noexam advpl_f_noexam alg1_m_0910_passed alg1_m_1112_passed alg1_f_0910_passed ... geo_f_enr phys_m_enr phys_f_enr satact_m satact_f totalpopulation population5_17 population5_17inpoverty total_enrollment 5_17_poverty_percent
0 0100005 Albertville City AL 0.097308 0.125604 0.001380 0.002070 0.162181 0.000690 0.130435 ... 0.110421 0.116632 0.077985 0.122153 0.128364 21786.0 4115.0 1546.0 1449 0.375699
1 0100006 Marshall County AL 0.055609 0.061361 0.000000 0.000000 0.120805 0.000959 0.087248 ... 0.161074 0.000000 0.000000 0.133269 0.101630 48481.0 8762.0 2495.0 1043 0.284752
2 0100007 Hoover City AL 0.164874 0.180241 0.010099 0.016246 0.004171 0.000000 0.005049 ... 0.122503 0.020198 0.010099 0.202415 0.200659 82783.0 14679.0 1038.0 4555 0.070713
3 0100008 Madison City AL 0.117059 0.104082 0.004055 0.000811 0.108678 0.000811 0.080292 ... 0.066234 0.039200 0.023250 0.136794 0.136794 46797.0 9683.0 735.0 3699 0.075906
4 0100011 Leeds City AL 0.077869 0.065574 0.000000 0.002049 0.098361 0.000000 0.098361 ... 0.131148 0.000000 0.000000 0.092213 0.106557 11900.0 1742.0 302.0 488 0.173364

5 rows × 40 columns

PCA¶

In [15]:
# Hover-label arrays for the interactive plots below.
# NOTE(review): these are aligned to the FULL df (one entry per row). Any plot
# built from a further-filtered subset of df must re-filter these arrays the
# same way, or labels will be misaligned.
ids = df['leaid'].values
lea_names = df['lea_name'].values
states = df['lea_state'].values
pop5_17 = df['population5_17']
pov5_17 = df['5_17_poverty_percent']
In [16]:
ids = df['leaid'].values

# Step 1: subset the frame. PCA is fit on districts with > 15 students to
# damp the extreme ratios tiny districts produce.
feature_cols = df.columns.difference(exclude_cols)
subset_df = df[feature_cols]
pca_mask = df['total_enrollment'] > 15
for_pca_use = df.loc[pca_mask, feature_cols]

# Step 2: standardize. Use a separate scaler instance for the PCA subset so
# `scaler` keeps the fit for subset_df (the original code silently refit it).
scaler = StandardScaler()
standardized_data = scaler.fit_transform(subset_df)
pca_data = StandardScaler().fit_transform(for_pca_use)

# Step 3: eigendecomposition of the covariance matrix. The matrix is
# symmetric, so eigh is the appropriate routine (guaranteed real output;
# eigenvector signs remain arbitrary, as with any eigendecomposition).
cov_matrix = np.cov(pca_data, rowvar=False)
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

# eigh returns eigenvalues in ascending order; re-sort descending.
sorted_indices = np.argsort(eigenvalues)[::-1]
eigenvectors = eigenvectors[:, sorted_indices]
eigenvalues = eigenvalues[sorted_indices]

# Step 4: project onto the top 3 principal components.
projected_data = pca_data @ eigenvectors[:, :3]

# Step 5: interactive 3-D scatter. Hover labels MUST come from the same
# filtered rows as projected_data -- zipping the full-frame id arrays against
# the filtered projection would misalign every label after the first dropped
# row (this was a bug in the original cell).
hover_text = [
    f"LEA ID: {i}, {state}<br>LEA Name: {lea}<br>5_17 Pop: {int(pop)}<br>5_17 Pov: {100*pov:.2f}%"
    for i, lea, state, pop, pov in zip(
        df.loc[pca_mask, 'leaid'],
        df.loc[pca_mask, 'lea_name'],
        df.loc[pca_mask, 'lea_state'],
        df.loc[pca_mask, 'population5_17'],
        df.loc[pca_mask, '5_17_poverty_percent'],
    )
]

trace = go.Scatter3d(
    x=projected_data[:, 0],
    y=projected_data[:, 1],
    z=projected_data[:, 2],
    mode='markers',
    marker=dict(size=5, color='blue', opacity=0.8),
    text=hover_text,
    hoverinfo="text+x+y+z"
)

# Pad each axis range by 10% on both sides. A plain loop replaces the
# original exec()-based mutation, and the padded ranges are now actually
# passed to the layout (previously they were computed and never used).
axis_ranges = []
for axis in range(3):
    lo, hi = projected_data[:, axis].min(), projected_data[:, axis].max()
    pad = 0.1 * (hi - lo)
    axis_ranges.append([lo - pad, hi + pad])

layout = go.Layout(
    title="Data Projected on Top 3 Principal Components",
    scene=dict(
        xaxis=dict(title="Principal Component 1", range=axis_ranges[0]),
        yaxis=dict(title="Principal Component 2", range=axis_ranges[1]),
        zaxis=dict(title="Principal Component 3", range=axis_ranges[2])
    )
)

fig = go.Figure(data=[trace], layout=layout)

pio.show(fig)
In [17]:
# projected_data rows correspond to for_pca_use (districts with
# total_enrollment > 15), NOT to df positions, so df.iloc on these argsort
# positions would pick the wrong districts. Map positions back through
# for_pca_use.index instead.
top3_positions = np.argsort(np.abs(projected_data[:, 0]))[-3:]
extreme_PC1 = df.loc[for_pca_use.index[top3_positions]]
extreme_PC1.T
Out[17]:
1288 2368 4117
leaid 1704440 2200810 3500030
lea_name Astoria CUSD 1 Jefferson Davis Parish ALAMOGORDO PUBLIC SCHOOLS
lea_state IL LA NM
advmath_m_enr 0.025862 0.080925 0.098757
advmath_f_enr 0.051724 0.106936 0.092541
advpl_m_noexam 0.0 0.0 0.004834
advpl_f_noexam 0.0 0.0 0.004834
alg1_m_0910_passed 0.043103 0.075145 0.084945
alg1_m_1112_passed 0.0 0.00289 0.000691
alg1_f_0910_passed 0.077586 0.101156 0.089088
alg1_f_1112_passed 0.0 0.008671 0.004144
alg2_m_enr 0.034483 0.069364 0.132597
alg2_f_enr 0.077586 0.089595 0.127762
bio_m_enr 0.146552 0.054913 0.186464
bio_f_enr 0.215517 0.069364 0.183702
calc_m_enr 0.034483 0.0 0.006906
calc_f_enr 0.008621 0.0 0.007597
chem_m_enr 0.017241 0.086705 0.070442
chem_f_enr 0.051724 0.106936 0.063536
dual_m_enr 0.0 0.075145 0.020028
dual_f_enr 0.025862 0.040462 0.037983
total_m_enr 0.482759 0.50289 0.495856
total_f_enr 0.517241 0.49711 0.504144
enr_lep_m 0.0 0.0 0.010359
enr_lep_f 0.0 0.0 0.007597
enr_504_m 0.017241 0.017341 0.01105
enr_504_f 0.017241 0.008671 0.008978
enr_idea_m 0.103448 0.049133 0.069061
enr_idea_f 0.043103 0.034682 0.051796
geo_m_enr 0.025862 0.069364 0.133287
geo_f_enr 0.051724 0.109827 0.149862
phys_m_enr 0.043103 0.0 0.015193
phys_f_enr 0.060345 0.0 0.004834
satact_m 0.094828 0.127168 0.0
satact_f 0.172414 0.132948 0.0
totalpopulation 1988.0 31477.0 44672.0
population5_17 321.0 5875.0 6725.0
population5_17inpoverty 70.0 1342.0 1713.0
total_enrollment 116 346 1448
5_17_poverty_percent 0.218069 0.228426 0.254721
In [18]:
# Loadings (feature weights) of the first two principal components.
# Sign of each component is arbitrary up to a global flip.
pc1 = eigenvectors[:, 0]
pc2 = eigenvectors[:, 1]
In [19]:
# PC1 loading per feature. Evaluate the feature list once instead of
# recomputing df.columns.difference() on every loop iteration.
feature_names = df.columns.difference(exclude_cols)
print(f"{'Column Name'.ljust(20)}: PC1 Weight")
for name, weight in zip(feature_names, pc1):
    print(f"{name.ljust(20)}: {100*weight:.2f}%")
Column Name         : PC1 Weight
5_17_poverty_percent: 17.12%
advmath_f_enr       : -26.03%
advmath_m_enr       : -26.97%
advpl_f_noexam      : -12.17%
advpl_m_noexam      : -13.04%
alg1_f_0910_passed  : 0.96%
alg1_f_1112_passed  : 1.50%
alg1_m_0910_passed  : 1.46%
alg1_m_1112_passed  : 3.12%
alg2_f_enr          : -19.97%
alg2_m_enr          : -19.77%
bio_f_enr           : -20.48%
bio_m_enr           : -17.42%
calc_f_enr          : -22.05%
calc_m_enr          : -23.60%
chem_f_enr          : -31.16%
chem_m_enr          : -30.86%
dual_f_enr          : -3.27%
dual_m_enr          : -3.52%
enr_504_f           : -16.03%
enr_504_m           : -16.18%
enr_idea_f          : 7.63%
enr_idea_m          : 9.71%
enr_lep_f           : 4.21%
enr_lep_m           : 4.25%
geo_f_enr           : -16.33%
geo_m_enr           : -15.94%
phys_f_enr          : -28.18%
phys_m_enr          : -29.44%
satact_f            : -16.86%
satact_m            : -15.16%
total_f_enr         : -5.65%
total_m_enr         : 5.65%
In [20]:
# PC2 loading per feature (same hoisted-lookup pattern as the PC1 cell).
feature_names = df.columns.difference(exclude_cols)
print(f"{'Column Name'.ljust(20)}: PC2 Weight")
for name, weight in zip(feature_names, pc2):
    print(f"{name.ljust(20)}: {100*weight:.2f}%")
Column Name         : PC2 Weight
5_17_poverty_percent: 21.06%
advmath_f_enr       : -10.37%
advmath_m_enr       : -10.68%
advpl_f_noexam      : -8.73%
advpl_m_noexam      : -9.86%
alg1_f_0910_passed  : 34.00%
alg1_f_1112_passed  : 20.76%
alg1_m_0910_passed  : 35.52%
alg1_m_1112_passed  : 21.46%
alg2_f_enr          : 25.15%
alg2_m_enr          : 26.05%
bio_f_enr           : 20.67%
bio_m_enr           : 26.02%
calc_f_enr          : -17.07%
calc_m_enr          : -18.48%
chem_f_enr          : 2.63%
chem_m_enr          : 2.99%
dual_f_enr          : 3.02%
dual_m_enr          : 1.68%
enr_504_f           : -17.38%
enr_504_m           : -15.71%
enr_idea_f          : 2.96%
enr_idea_m          : 6.42%
enr_lep_f           : 9.90%
enr_lep_m           : 10.12%
geo_f_enr           : 29.84%
geo_m_enr           : 31.90%
phys_f_enr          : -1.30%
phys_m_enr          : -2.71%
satact_f            : -1.68%
satact_m            : -1.72%
total_f_enr         : -5.16%
total_m_enr         : 5.16%
In [21]:
# K-Means inertia for k = 1..10 (elbow heuristic). Fixed random_state for
# reproducibility; n_init=10 restarts per k.
k_range = range(1, 11)
inertia = []
for k in k_range:
    km = KMeans(n_clusters=k, random_state=42, n_init=10).fit(standardized_data)
    inertia.append(km.inertia_)

# Plot the elbow curve
plt.figure(figsize=(8, 6))
plt.plot(k_range, inertia, 'bo-')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal k')
plt.show()
Figure: elbow curve of K-Means inertia versus number of clusters k (1–10).
In [22]:
# Locate the elbow of the inertia curve programmatically instead of eyeballing
# the plot above (inertia is convex and decreasing in k).
knee = KneeLocator(k_range, inertia, curve="convex", direction="decreasing")

# Elbow point
optimal_k = knee.elbow

print(f"The optimal number of clusters (k) is: {optimal_k}")
The optimal number of clusters (k) is: 4
In [23]:
# Final K-Means fit at the chosen k; cluster labels are stored on the frame
# for the LDA and per-cluster summaries below.
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
df['cluster'] = kmeans.fit_predict(standardized_data)
In [24]:
# District count per cluster. The original cell initialized an `enr_cols`
# list that was never used anywhere below; it has been removed.
# value_counts().sort_index() gives the same counts in the same (ascending
# label) order as the previous np.unique / np.sum loop.
cluster_counts = df['cluster'].value_counts().sort_index()
print(f"{'Cluster'.ljust(10)}: LEAs in Dataset")
for cluster, count in cluster_counts.items():
    print(f"{str(cluster).ljust(10)}: {count}")
Cluster   : LEAs in Dataset
0         : 60
1         : 2859
2         : 1591
3         : 3305
In [25]:
def lda(X, y):
    """Fisher linear discriminant analysis (multi-class).

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of shape (n_samples,)
        Class label per sample.

    Returns
    -------
    eigval : ndarray
        Eigenvalues of Sw^{-1} Sb, sorted by descending real part. At most
        (n_classes - 1) of them are meaningfully non-zero.
    eigvec : ndarray
        Real parts of the corresponding eigenvectors, as columns, in the
        same order. Signs are arbitrary up to a global flip per column.
    """
    overall_mean = X.mean(axis=0)
    class_labels = np.unique(y)

    class_means, centered, class_sizes = [], [], []
    for cl in class_labels:
        data = X[y == cl]
        class_means.append(data.mean(axis=0))
        centered.append(data - class_means[-1])
        class_sizes.append(len(data))

    # Within-class scatter: sum of per-class centered Gram matrices.
    Sw = sum(c.T @ c for c in centered)
    # Between-class scatter: size-weighted outer products of mean offsets.
    mean_offsets = np.asarray(class_means) - overall_mean
    Sb = sum(n_i * np.outer(d, d) for d, n_i in zip(mean_offsets, class_sizes))

    # Solve Sw @ M = Sb instead of forming the explicit inverse -- same
    # result as inv(Sw) @ Sb but numerically better conditioned.
    eigval, eigvec = np.linalg.eig(np.linalg.solve(Sw, Sb))

    # np.linalg.eig may return complex values with tiny imaginary parts;
    # sort explicitly on the real part, descending.
    idx = np.argsort(eigval.real)[::-1]
    return eigval[idx], np.real(eigvec[:, idx])
In [26]:
# Project the standardized features onto the LDA discriminant axes and plot
# the first three, colored by K-Means cluster. X_lda rows align 1:1 with df
# rows here (no extra filtering), so the full-frame hover arrays are safe.
X = standardized_data
y = df['cluster']
eigval, eigvec = lda(X, y)
X_lda = X @ eigvec

# Guarantee three columns for the 3-D scatter: zero-pad if LDA produced fewer.
n_missing = 3 - X_lda.shape[1]
if n_missing > 0:
    X_lda = np.pad(X_lda, ((0, 0), (0, n_missing)), mode='constant')

hover_labels = [
    f"LEA ID: {i}, {state}<br>LEA Name: {lea}<br>5_17 Pop: {int(pop)}<br>5_17 Pov: {100*pov:.2f}%"
    for i, lea, state, pop, pov in zip(ids, lea_names, states, pop5_17, pov5_17)
]

trace = go.Scatter3d(
    x=X_lda[:, 0],
    y=X_lda[:, 1],
    z=X_lda[:, 2],
    mode='markers',
    marker=dict(size=5, color=y, opacity=0.8),
    text=hover_labels,
    hoverinfo="text+x+y+z"
)

layout = go.Layout(
    title="LDA Projection on Top 3 Discriminant Components",
    scene=dict(
        xaxis_title="LDA Component 1",
        yaxis_title="LDA Component 2",
        zaxis_title="LDA Component 3"
    )
)

fig = go.Figure(data=[trace], layout=layout)

pio.show(fig)
In [27]:
# Three districts most extreme on LDA component 1. Unlike the PCA cell,
# X_lda rows align 1:1 with df rows, so positional df.iloc is correct here.
extreme_LDA = df.iloc[np.argsort(np.abs(X_lda[:, 0]))[-3:]]
extreme_LDA.T
Out[27]:
4133 3038 1220
leaid 3500690 2636600 1602520
lea_name DEMING PUBLIC SCHOOLS Yale Public Schools OROFINO JOINT DISTRICT
lea_state NM MI ID
advmath_m_enr 0.0 0.0 0.0
advmath_f_enr 0.0 0.0 0.0
advpl_m_noexam 0.016393 0.025994 0.0
advpl_f_noexam 0.006903 0.018349 0.0
alg1_m_0910_passed 0.197584 0.247706 0.092437
alg1_m_1112_passed 0.106989 0.255352 0.588235
alg1_f_0910_passed 0.210526 0.215596 0.058824
alg1_f_1112_passed 0.112166 0.218654 0.226891
alg2_m_enr 0.421053 0.0 0.210084
alg2_f_enr 0.396894 0.0 0.042017
bio_m_enr 0.398619 0.0 0.378151
bio_f_enr 0.385677 0.0 0.142857
calc_m_enr 0.421053 0.0 0.0
calc_f_enr 0.397757 0.0 0.0
chem_m_enr 0.398619 0.0 0.0
chem_f_enr 0.385677 0.0 0.0
dual_m_enr 0.117343 0.0 0.0
dual_f_enr 0.120794 0.0 0.0
total_m_enr 0.497843 0.555046 0.705882
total_f_enr 0.502157 0.444954 0.294118
enr_lep_m 0.144953 0.003058 0.0
enr_lep_f 0.144953 0.0 0.0
enr_504_m 0.0 0.01682 0.0
enr_504_f 0.001726 0.010703 0.0
enr_idea_m 0.037964 0.079511 0.092437
enr_idea_f 0.012942 0.012232 0.033613
geo_m_enr 0.421053 0.0 0.0
geo_f_enr 0.397757 0.0 0.0
phys_m_enr 0.398619 0.0 0.0
phys_f_enr 0.385677 0.0 0.0
satact_m 0.038827 0.0 0.0
satact_f 0.072476 0.0 0.0
totalpopulation 24078.0 11026.0 8799.0
population5_17 4466.0 2032.0 1092.0
population5_17inpoverty 1803.0 252.0 210.0
total_enrollment 1159 654 119
5_17_poverty_percent 0.403717 0.124016 0.192308
cluster 0 0 0
In [28]:
# First two LDA discriminant directions (rows of eigvec.T == columns of eigvec).
eig1, eig2 = (eigvec.T)[:2]
# Exclude the cluster label from the feature list used below. Guarded so
# re-running this cell does not append 'cluster' repeatedly (the original
# bare append made the cell non-idempotent).
if 'cluster' not in exclude_cols:
    exclude_cols.append('cluster')
In [29]:
# Weights of LDA discriminant component 1 per feature. The original header
# said "PC1 Weight" -- a copy-paste leftover from the PCA cells; these are
# LDA weights, not principal-component loadings.
feature_names = df.columns.difference(exclude_cols)
print(f"{'Column Name'.ljust(20)}: LDA1 Weight")
for col_name, weight in zip(feature_names, eig1):
    print(f"{col_name.ljust(20)}: {100*weight:.2f}%")
Column Name         : PC1 Weight
5_17_poverty_percent: -10.32%
advmath_f_enr       : 13.06%
advmath_m_enr       : 10.28%
advpl_f_noexam      : 6.74%
advpl_m_noexam      : 5.25%
alg1_f_0910_passed  : -1.47%
alg1_f_1112_passed  : -17.86%
alg1_m_0910_passed  : -0.55%
alg1_m_1112_passed  : -5.19%
alg2_f_enr          : 5.74%
alg2_m_enr          : 1.81%
bio_f_enr           : 5.95%
bio_m_enr           : 0.31%
calc_f_enr          : 7.86%
calc_m_enr          : 7.39%
chem_f_enr          : 7.59%
chem_m_enr          : 9.97%
dual_f_enr          : -2.38%
dual_m_enr          : -0.97%
enr_504_f           : 7.13%
enr_504_m           : 8.11%
enr_idea_f          : -3.62%
enr_idea_m          : -1.88%
enr_lep_f           : -0.02%
enr_lep_m           : -0.78%
geo_f_enr           : 4.00%
geo_m_enr           : 3.95%
phys_f_enr          : 18.28%
phys_m_enr          : 17.46%
satact_f            : 4.58%
satact_m            : 4.61%
total_f_enr         : -61.78%
total_m_enr         : -64.66%
In [30]:
# Weights of LDA discriminant component 2 per feature. The original header
# said "PC1 Weight", which was doubly wrong: these are LDA weights, and this
# cell prints the SECOND component.
feature_names = df.columns.difference(exclude_cols)
print(f"{'Column Name'.ljust(20)}: LDA2 Weight")
for col_name, weight in zip(feature_names, eig2):
    print(f"{col_name.ljust(20)}: {100*weight:.2f}%")
Column Name         : PC1 Weight
5_17_poverty_percent: -1.68%
advmath_f_enr       : 2.21%
advmath_m_enr       : 6.55%
advpl_f_noexam      : 3.06%
advpl_m_noexam      : -0.19%
alg1_f_0910_passed  : -6.89%
alg1_f_1112_passed  : 74.78%
alg1_m_0910_passed  : 9.82%
alg1_m_1112_passed  : 39.42%
alg2_f_enr          : 4.56%
alg2_m_enr          : 7.49%
bio_f_enr           : 2.26%
bio_m_enr           : 0.21%
calc_f_enr          : 3.47%
calc_m_enr          : 3.37%
chem_f_enr          : -1.81%
chem_m_enr          : 7.64%
dual_f_enr          : 1.70%
dual_m_enr          : -2.53%
enr_504_f           : 0.55%
enr_504_m           : 2.34%
enr_idea_f          : 1.28%
enr_idea_m          : -4.16%
enr_lep_f           : -0.51%
enr_lep_m           : -4.35%
geo_f_enr           : 2.75%
geo_m_enr           : 7.48%
phys_f_enr          : -12.06%
phys_m_enr          : 18.60%
satact_f            : 6.49%
satact_m            : -1.25%
total_f_enr         : -27.83%
total_m_enr         : -32.50%

Covariance¶

In [31]:
# Rebuild a labeled frame from the standardized array. The covariance of
# z-scored data IS the correlation matrix, hence the variable name.
# NOTE(review): this relies on df.columns.difference(exclude_cols) producing
# the same 33 columns (in the same sorted order) as when standardized_data
# was built -- true here because only 'cluster' was added and then excluded.
standardized_df = pd.DataFrame(standardized_data)
standardized_df.columns = df.columns.difference(exclude_cols)
correlation_matrix = standardized_df.cov()
In [32]:
# Heatmap of feature correlations (diverging colormap centered on white).
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=False, fmt=".2f", cmap="bwr", cbar=True)
plt.title('Correlation Matrix Heatmap')
plt.show()
Figure: heatmap of the correlation matrix of the standardized feature columns.
In [33]:
# Covariance of the RAW (unstandardized) feature columns, for contrast with
# the correlation heatmap above.
covariance_matrix = df[df.columns.difference(exclude_cols)].cov()
plt.figure(figsize=(12, 8))
sns.heatmap(covariance_matrix, annot=False, fmt=".2f", cmap="bwr", cbar=True)
plt.title('Covariance Matrix Heatmap')
plt.show()
Figure: heatmap of the covariance matrix of the raw (unstandardized) feature columns.

Multiple Regression¶

In [34]:
# Regression target and the initial pool of candidate predictors
# (all feature columns except the target itself).
dependent_var = '5_17_poverty_percent'
independent_vars = df.columns.difference(exclude_cols + [dependent_var])
In [35]:
# Manual backward elimination: drop predictors found insignificant in earlier
# fits. NOTE(review): this list was presumably built by inspecting p-values
# from a previous model.summary() run -- re-derive it if the data changes.
high_p_vals = ['alg2_f_enr','enr_lep_f','calc_f_enr','total_m_enr',
               'alg1_m_0910_passed','alg1_m_1112_passed','enr_idea_f',
               'advmath_m_enr','enr_504_m','geo_f_enr','advpl_f_noexam',
               'satact_f','chem_m_enr', 'alg1_f_1112_passed']
independent_vars = independent_vars.difference(high_p_vals)
independent_vars
Out[35]:
Index(['advmath_f_enr', 'advpl_m_noexam', 'alg1_f_0910_passed', 'alg2_m_enr',
       'bio_f_enr', 'bio_m_enr', 'calc_m_enr', 'chem_f_enr', 'dual_f_enr',
       'dual_m_enr', 'enr_504_f', 'enr_idea_m', 'enr_lep_m', 'geo_m_enr',
       'phys_f_enr', 'phys_m_enr', 'satact_m', 'total_f_enr'],
      dtype='object')
In [36]:
# OLS regression of the child-poverty rate on the retained enrollment rates.
# X (design matrix with intercept) is reused below for the VIF computation.
X = df[independent_vars]
X = sm.add_constant(X)
Y = df[dependent_var]
model = sm.OLS(Y, X).fit()
model.summary()
Out[36]:
OLS Regression Results
Dep. Variable: 5_17_poverty_percent R-squared: 0.241
Model: OLS Adj. R-squared: 0.239
Method: Least Squares F-statistic: 137.4
Date: Fri, 30 Aug 2024 Prob (F-statistic): 0.00
Time: 03:31:17 Log-Likelihood: 8552.5
No. Observations: 7815 AIC: -1.707e+04
Df Residuals: 7796 BIC: -1.693e+04
Df Model: 18
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
const 0.1331 0.017 7.781 0.000 0.100 0.167
advmath_f_enr -0.1929 0.019 -10.086 0.000 -0.230 -0.155
advpl_m_noexam -0.1836 0.035 -5.315 0.000 -0.251 -0.116
alg1_f_0910_passed 0.0589 0.024 2.421 0.015 0.011 0.107
alg2_m_enr 0.0596 0.024 2.516 0.012 0.013 0.106
bio_f_enr -0.1131 0.030 -3.797 0.000 -0.172 -0.055
bio_m_enr 0.2002 0.030 6.566 0.000 0.140 0.260
calc_m_enr -1.1887 0.042 -28.118 0.000 -1.272 -1.106
chem_f_enr -0.0908 0.022 -4.093 0.000 -0.134 -0.047
dual_f_enr 0.1352 0.028 4.748 0.000 0.079 0.191
dual_m_enr -0.1658 0.030 -5.533 0.000 -0.225 -0.107
enr_504_f -0.6654 0.060 -11.107 0.000 -0.783 -0.548
enr_idea_m 0.0626 0.027 2.285 0.022 0.009 0.116
enr_lep_m 0.4123 0.027 15.406 0.000 0.360 0.465
geo_m_enr 0.0958 0.024 4.000 0.000 0.049 0.143
phys_f_enr 0.2257 0.045 5.005 0.000 0.137 0.314
phys_m_enr -0.2387 0.042 -5.702 0.000 -0.321 -0.157
satact_m -0.0909 0.015 -5.890 0.000 -0.121 -0.061
total_f_enr 0.1163 0.033 3.482 0.000 0.051 0.182
Omnibus: 1220.707 Durbin-Watson: 1.522
Prob(Omnibus): 0.000 Jarque-Bera (JB): 2241.907
Skew: 0.991 Prob(JB): 0.00
Kurtosis: 4.719 Cond. No. 79.0


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [37]:
# Variance inflation factors computed against the full design matrix X
# (including the constant column, whose large VIF is expected and harmless).
vif_data = pd.DataFrame({
    "Variable": X.columns,
    "VIF": [vif(X.values, i) for i in range(X.shape[1])],
})
vif_data
Out[37]:
Variable VIF
0 const 347.644766
1 advmath_f_enr 1.178837
2 advpl_m_noexam 1.057485
3 alg1_f_0910_passed 1.107583
4 alg2_m_enr 1.216256
5 bio_f_enr 3.361558
6 bio_m_enr 3.427158
7 calc_m_enr 1.169718
8 chem_f_enr 1.311806
9 dual_f_enr 4.903776
10 dual_m_enr 4.875324
11 enr_504_f 1.087826
12 enr_idea_m 1.084125
13 enr_lep_m 1.035152
14 geo_m_enr 1.240517
15 phys_f_enr 6.472962
16 phys_m_enr 6.762970
17 satact_m 1.083999
18 total_f_enr 1.536176